In [1]:
import numpy as np
import pandas as pd
In [2]:
print(pd.Series( [1, 5, 2, 3, 6, 4] ) )
In [3]:
s = pd.Series(range(5), index=list('abcde'))
print('b' in s) # checks index values
print(s.isin([2])) # checks in values
In [4]:
print(s)
s = s.reindex(list('afg'))
s # due to NaN, whole series dtype changed to float. To escape from such instances, make the series object dtype initially.
Out[4]:
In [5]:
s_obj = pd.Series(range(5), index=list('abcde'), dtype=np.object)
print(s_obj)
s_obj = s_obj.reindex(list('afg'))
s_obj # No change in the dtype
Out[5]:
In [6]:
# 1. Using list of dictionary
lst = [{"C1": 1, "C2": 2},
{"C1": 5, "C2": 10, "C3": 20}]
In [7]:
# Observe NaN
print(pd.DataFrame(lst, index = ["R1", "R2"]))
In [8]:
# 2. Using list (Commonly used)
lst = {"C1": ["1", "3"],
"C2": ["2","4"]}
print( pd.DataFrame(lst, index = ["R1", "R2"]) )
In [9]:
# Defining Column name within df function
print(pd.DataFrame(np.random.randn(2,2),
index = list('pq'),
columns = list('ab')))
In [10]:
df = pd.DataFrame({'A': [10., 20.],
'B': "text",
'C': [2,60],
'D': 3+9j})
print(df)
In [11]:
# Observe final dtype as OBJECT to preserve every
# element's data type information.
print(df.dtypes)
In [12]:
print(df.info()) # memory usage doesn't include object dtypes columns
In [13]:
df.info(memory_usage='deep') # with deep even object dtypes are included
In [14]:
print(pd.DataFrame(np.random.randn(20,2)).head())
In [15]:
print(pd.DataFrame(np.random.randn(20,2)).tail(5))
In [16]:
df = pd.DataFrame({'A': [10., 20.],
'B': "text",
'C': [2,60],
'D': 3+9j}, index = list('PQ'))
In [17]:
print((df.index)) #index of rows
In [18]:
print(df.columns) # index of columns
In [19]:
print(df.values) # display values of df
In [20]:
df = pd.DataFrame({'A': [1,4,7],
'B': [2,5,8],
'C': [3,6,9]},
index = list('PQR'))
print(df)
In [21]:
# Transpose
print(df.T)
In [22]:
# shape, size and dimensions
print(df.shape, df.size, df.ndim)
In [23]:
df1 = pd.DataFrame({'A': [1,4,7],
'B': [2,5,8],
'C': [3,6,9]},
index = list('PQR'))
df_temp = df1
df2 = pd.DataFrame({'A': [-1,4,7],
'B': [2,-5,8],
'C': [3,6,-9]},
index = list('PQR'))
df3 = pd.DataFrame({'A': [1,-4,-7],
'B': [-2,5,-8],
'C': [-3,-6,9]},
index = list('STU'))
In [24]:
print(df1.add(df2))
In [25]:
print(df1.sub(df3))
In [26]:
print(df_temp.append(df2))
In [27]:
print(df3.apply(np.abs, axis = 1))
In [28]:
df3
Out[28]:
In [29]:
df3.apply(np.sum, axis = 0)
Out[29]:
In [30]:
print(type(df3.as_matrix()))
In [31]:
df1
Out[31]:
In [32]:
print(df1.count()) #by default -> col
In [33]:
print(df1.count(axis = 1)) # row
In [34]:
print(df1.sub(df3).count()) # nan in all col
In [35]:
df = pd.DataFrame(np.nan,columns = list('AB'),
index = list('CD'))
print(df)
In [36]:
print(df.isnull())
In [37]:
print(df1.sub(df3).fillna(5))
In [38]:
df = pd.DataFrame([[np.nan, 2, np.nan, 0],
[3, 4, np.nan, 1],
[np.nan, np.nan, np.nan, 5]],
columns=list('ABCD'))
print(df)
In [39]:
print(df.dropna(axis = 1, how = 'all'))
In [40]:
print(df.dropna(axis = 1, how='any'))
In [41]:
df = pd.DataFrame([[1, 2, 3],
[1, 2, 3],
[4, 5, 6],
[1, 2, 3]], columns =list('ABC'))
# representing which rows are duplicated
print(df.duplicated())
In [42]:
df = pd.DataFrame([["hi", 2, 3],
[1, 20, 3+3j],
[4, "world", 6]],
columns =list('ABC'))
print(df)
In [43]:
print(df.isin( [20, 6+0j, 'hi'] ))
In [44]:
df = pd.DataFrame([[15, 12],
[33, 54],
[10, 32]],
index = list('ABC'),
columns = list('DE'))
print(df)
In [45]:
print(df.loc[['A','C'],:])
In [46]:
print(df.loc[:,'E'])
In [47]:
print(df.loc['B', 'D'])
In [48]:
df = pd.DataFrame([[15, 12],
[33, 54],
[10, 32]])
print(df)
In [49]:
print(df.iloc[[0,2],:])
In [50]:
print(df.iloc[:,0:1])
In [51]:
print(df.iloc[:,:])
In [52]:
df = pd.DataFrame([[15, 12],
[33, 54],
[10, 32]],
index = ['one','two','three'],
columns = ['col1', 'col2'])
print(df)
In [53]:
print(df.filter(regex = 'e$', axis = 0))
In [54]:
print(df.filter(regex = '^c', axis = 1))
In [55]:
df = pd.DataFrame([[15, 12],
[33, 54],
[10, 32]])
df[df < 15] = 10
df
Out[55]:
In [56]:
df1 = pd.DataFrame([[15, 12],
[33, 54],
[10, 32]],
columns = list('AB'))
df2 = pd.DataFrame([[15, 12, -3],
[33, 54, 21],
[10, 32, 22]],
columns = list('ABC'))
df3 = pd.DataFrame([[10, 1, 3],
[33, -54, 2],
[10, 0.32, 2]],
columns = list('ABC'))
print(df1)
In [57]:
print(df1.eval('B - A'))
In [58]:
df1.eval('C = B - A', inplace=False) # < C = > is optional | inplace=True doesn't return output, rather changes orig.
Out[58]:
In [59]:
loc_var = np.arange(3)
df1.eval('C = A + @loc_var', inplace=False) # @ allows to use external variables
Out[59]:
In [60]:
print(df1)
In [61]:
print(df1.equals(df2)) # On broader scale
In [62]:
print(df2.eq(df3))
In [63]:
print(df2.le(df3))
In [64]:
print(df2.ne(df3))
In [65]:
print(pd.concat( [df2,df3], axis = 0 ))
In [66]:
print(pd.concat( [df2,df3], axis = 1 ))
In [67]:
df = pd.DataFrame([
['PID0', 'Gold', '1$', '2£'],
['PID0', 'Bronze', '2$', '4£'],
['PID1', 'Gold', '3$', '6£'],
['PID1', 'Silver', '4$', '8£'],
], columns = ['PID', 'CType',
'USD', 'PND'])
print(df)
In [68]:
print(df.pivot(index = 'PID', columns = 'CType'))
In [69]:
print(df.pivot(index = 'PID', columns = 'CType',
values = 'USD'))
In [70]:
df = pd.DataFrame([
['PID0', 'Gold', '1$'],
['PID0', 'Bronze', '2$'],
['PID0', 'Gold', '3$'], #Duplicate
['PID1', 'Silver', '4$'],
], columns = ['PID', 'CType',
'USD'])
In [71]:
# Ambiguity in saving value to PID0_Gold
# on values 1$ and 3$
print(df)
In [72]:
print(df.pivot_table(index = 'PID',
columns = 'CType',
aggfunc = np.max))
In [73]:
multi_ind = pd.MultiIndex.from_tuples(
[('IND','R1'),
('IND','R2'),
('US','R1'),
('US','R2')],
names = [
'1st',
'2nd'])
df = pd.DataFrame(np.random.randn(4,2),
index = multi_ind,
columns = ['C1', 'C2'])
print(df)
In [74]:
print(df.stack()) # increase in length
In [75]:
print(df.unstack()) # increase in width
In [76]:
df = pd.DataFrame([["B", 1],
["B", 2],
["A", 3],
["A", 4]], columns = list('XY'))
print(df)
In [77]:
print(df.groupby(['X']).sum())
In [78]:
print(df.groupby(['X'], sort = False).sum())
In [79]:
df = pd.DataFrame({'A' : list('ppppqqqq'),
'B' : list('rrssrrss'),
'C': np.random.randn(8),
'D': np.random.randn(8)})
print(df)
In [80]:
print(df.groupby(['A', 'B']).agg(
{'C': {'C_mean': 'mean'},
'D': {'D_median': 'median'}
}))
In [81]:
df = pd.DataFrame([[11, 202],
[33, 44]],
index = list('AB'),
columns = list('CD'))
In [82]:
df.to_excel('pd_df.xlsx', sheet_name = 'Sheet1')
In [83]:
print(pd.read_excel('pd_df.xlsx', 'Sheet1'))